{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "10.1 数据收集"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "ec9762283429e224"
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-12-13T08:00:18.007290Z",
     "start_time": "2024-12-13T08:00:17.975629Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "   区域        小区名称    户型    面积(㎡)  价格(元/月)\n0  东城     万国城MOMA  1室0厅  59.11平米    10000\n1  东城    北官厅胡同2号院  3室0厅  56.92平米     6000\n2  东城       和平里三区  1室1厅  40.57平米     6900\n3  东城        菊儿胡同  2室1厅  57.09平米     8000\n4  东城  交道口北二条35号院  1室1厅  42.67平米     5500",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>区域</th>\n      <th>小区名称</th>\n      <th>户型</th>\n      <th>面积(㎡)</th>\n      <th>价格(元/月)</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>东城</td>\n      <td>万国城MOMA</td>\n      <td>1室0厅</td>\n      <td>59.11平米</td>\n      <td>10000</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>东城</td>\n      <td>北官厅胡同2号院</td>\n      <td>3室0厅</td>\n      <td>56.92平米</td>\n      <td>6000</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>东城</td>\n      <td>和平里三区</td>\n      <td>1室1厅</td>\n      <td>40.57平米</td>\n      <td>6900</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>东城</td>\n      <td>菊儿胡同</td>\n      <td>2室1厅</td>\n      <td>57.09平米</td>\n      <td>8000</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>东城</td>\n      <td>交道口北二条35号院</td>\n      <td>1室1厅</td>\n      <td>42.67平米</td>\n      <td>5500</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "# 从文件中读取链家网站上采集的北京租房数据\n",
    "file_data = pd.read_csv(r'D:\\Python data analysis\\gitee-python-project\\pythonproject\\李永艳\\lianjia_houses.csv', encoding='gbk')\n",
    "file_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 8223 entries, 0 to 8222\n",
      "Data columns (total 5 columns):\n",
      " #   Column   Non-Null Count  Dtype \n",
      "---  ------   --------------  ----- \n",
      " 0   区域       8223 non-null   object\n",
      " 1   小区名称     8223 non-null   object\n",
      " 2   户型       8223 non-null   object\n",
      " 3   面积(㎡)    8223 non-null   object\n",
      " 4   价格(元/月)  8223 non-null   int64 \n",
      "dtypes: int64(1), object(4)\n",
      "memory usage: 321.3+ KB\n"
     ]
    }
   ],
   "source": [
    "file_data.info()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-12-13T08:03:20.560823Z",
     "start_time": "2024-12-13T08:03:20.553956Z"
    }
   },
   "id": "97e0344844e93221",
   "execution_count": 4
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "#10.2 数据处理"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "10de2904f9c0a909"
  },
  {
   "cell_type": "markdown",
   "source": [],
   "metadata": {
    "collapsed": false
   },
   "id": "14123efd5bd4b6ab"
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "##10.3.1 重复值检测与处理"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "e83258f1354cb82"
  },
  {
   "cell_type": "markdown",
   "source": [],
   "metadata": {
    "collapsed": false
   },
   "id": "5ad84697bd39b3de"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": "      区域     小区名称     户型     面积(㎡)  价格(元/月)\n65    东城    金鱼池东区   2室1厅   66.46平米     6000\n66    东城     凯德华玺  2房间1卫   73.86平米    13000\n67    东城     沙井胡同   2室1厅   25.15平米     7900\n68    东城     沙井胡同   1室1厅    12.3平米     4200\n69    东城  安贞苑50号院   2室1厅   65.82平米     6900\n...   ..      ...    ...       ...      ...\n8196  顺义     仓上小区   3室1厅  108.03平米     3900\n8197  顺义     石园东区   2室1厅   91.93平米     4100\n8198  顺义     裕龙三区   1室1厅   69.04平米     3800\n8199  顺义     建新北区   2室1厅   50.04平米     3600\n8213  顺义      江山赋   2室1厅   74.62平米     5200\n\n[2450 rows x 5 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>区域</th>\n      <th>小区名称</th>\n      <th>户型</th>\n      <th>面积(㎡)</th>\n      <th>价格(元/月)</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>65</th>\n      <td>东城</td>\n      <td>金鱼池东区</td>\n      <td>2室1厅</td>\n      <td>66.46平米</td>\n      <td>6000</td>\n    </tr>\n    <tr>\n      <th>66</th>\n      <td>东城</td>\n      <td>凯德华玺</td>\n      <td>2房间1卫</td>\n      <td>73.86平米</td>\n      <td>13000</td>\n    </tr>\n    <tr>\n      <th>67</th>\n      <td>东城</td>\n      <td>沙井胡同</td>\n      <td>2室1厅</td>\n      <td>25.15平米</td>\n      <td>7900</td>\n    </tr>\n    <tr>\n      <th>68</th>\n      <td>东城</td>\n      <td>沙井胡同</td>\n      <td>1室1厅</td>\n      <td>12.3平米</td>\n      <td>4200</td>\n    </tr>\n    <tr>\n      <th>69</th>\n      <td>东城</td>\n      <td>安贞苑50号院</td>\n      <td>2室1厅</td>\n      <td>65.82平米</td>\n      <td>6900</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>8196</th>\n      <td>顺义</td>\n      <td>仓上小区</td>\n      <td>3室1厅</td>\n      <td>108.03平米</td>\n      <td>3900</td>\n    </tr>\n    <tr>\n      <th>8197</th>\n      <td>顺义</td>\n      <td>石园东区</td>\n      <td>2室1厅</td>\n      <td>91.93平米</td>\n      <td>4100</td>\n    </tr>\n    <tr>\n      <th>8198</th>\n      <td>顺义</td>\n      <td>裕龙三区</td>\n      <td>1室1厅</td>\n      <td>69.04平米</td>\n      <td>3800</td>\n    </tr>\n    <tr>\n      <th>8199</th>\n      <td>顺义</td>\n      <td>建新北区</td>\n      <td>2室1厅</td>\n      <td>50.04平米</td>\n      <td>3600</td>\n    </tr>\n    <tr>\n      <th>8213</th>\n      <td>顺义</td>\n      <td>江山赋</td>\n      <td>2室1厅</td>\n      <td>74.62平米</td>\n      <td>5200</td>\n    </tr>\n  </tbody>\n</table>\n<p>2450 rows × 5 columns</p>\n</div>"
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 检测租房数据是否有重复值\n",
    "dup_value = file_data.duplicated()\n",
    "file_data[dup_value==True]"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-12-13T08:05:35.670429Z",
     "start_time": "2024-12-13T08:05:35.654091Z"
    }
   },
   "id": "88861521ba4de64f",
   "execution_count": 5
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": "      区域        小区名称     户型     面积(㎡)  价格(元/月)\n0     东城     万国城MOMA   1室0厅   59.11平米    10000\n1     东城    北官厅胡同2号院   3室0厅   56.92平米     6000\n2     东城       和平里三区   1室1厅   40.57平米     6900\n3     东城        菊儿胡同   2室1厅   57.09平米     8000\n4     东城  交道口北二条35号院   1室1厅   42.67平米     5500\n...   ..         ...    ...       ...      ...\n5768  顺义        怡馨家园   3室1厅  114.03平米     5500\n5769  顺义      旭辉26街区  4房间2卫      59平米     5000\n5770  顺义     前进花园玉兰苑   3室1厅   92.41平米     5800\n5771  顺义        双裕小区   2室1厅   71.81平米     4200\n5772  顺义       樱花园二区   1室1厅   35.43平米     2700\n\n[5773 rows x 5 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>区域</th>\n      <th>小区名称</th>\n      <th>户型</th>\n      <th>面积(㎡)</th>\n      <th>价格(元/月)</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>东城</td>\n      <td>万国城MOMA</td>\n      <td>1室0厅</td>\n      <td>59.11平米</td>\n      <td>10000</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>东城</td>\n      <td>北官厅胡同2号院</td>\n      <td>3室0厅</td>\n      <td>56.92平米</td>\n      <td>6000</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>东城</td>\n      <td>和平里三区</td>\n      <td>1室1厅</td>\n      <td>40.57平米</td>\n      <td>6900</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>东城</td>\n      <td>菊儿胡同</td>\n      <td>2室1厅</td>\n      <td>57.09平米</td>\n      <td>8000</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>东城</td>\n      <td>交道口北二条35号院</td>\n      <td>1室1厅</td>\n      <td>42.67平米</td>\n      <td>5500</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>5768</th>\n      <td>顺义</td>\n      <td>怡馨家园</td>\n      <td>3室1厅</td>\n      <td>114.03平米</td>\n      <td>5500</td>\n    </tr>\n    <tr>\n      <th>5769</th>\n      <td>顺义</td>\n      <td>旭辉26街区</td>\n      <td>4房间2卫</td>\n      <td>59平米</td>\n      <td>5000</td>\n    </tr>\n    <tr>\n      <th>5770</th>\n      <td>顺义</td>\n      <td>前进花园玉兰苑</td>\n      <td>3室1厅</td>\n      <td>92.41平米</td>\n      <td>5800</td>\n    </tr>\n    <tr>\n      <th>5771</th>\n      <td>顺义</td>\n      <td>双裕小区</td>\n      <td>2室1厅</td>\n      <td>71.81平米</td>\n      <td>4200</td>\n    </tr>\n    <tr>\n      <th>5772</th>\n      <td>顺义</td>\n      <td>樱花园二区</td>\n      <td>1室1厅</td>\n      <td>35.43平米</td>\n      <td>2700</td>\n    </tr>\n  </tbody>\n</table>\n<p>5773 rows × 5 columns</p>\n</div>"
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 删除重复值，重新分配行索引\n",
    "file_data = file_data.drop_duplicates(ignore_index=True)\n",
    "file_data"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-12-13T08:05:48.665352Z",
     "start_time": "2024-12-13T08:05:48.649523Z"
    }
   },
   "id": "a7b5c5cf874ac300",
   "execution_count": 6
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   },
   "id": "8eed7a995edc3a03"
  },
  {
   "cell_type": "markdown",
   "source": [],
   "metadata": {
    "collapsed": false
   },
   "id": "912e4c36019cf8f0"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
